#!/bin/bash
#
#[seqpipe version="0.3.3 ($Id: bioseq.pipe 127 2012-10-28 16:40:53Z yanll $)"]
#

# Load some useful checkers
. common.inc

# Load global parameters
. config.inc

###########################################################################

#[procedure type="sysinfo"]
function bioseq_tools_version
{
	echo -n 'bwa      : '; bwa |& grep Version | cut -d' ' -f2
	echo -n 'samtools : '; samtools |& grep Version | cut -d' ' -f2-
	echo -n 'bcftools : '; bcftools |& grep Version | cut -d' ' -f2-
	echo -n 'picard   : '; java -jar ${PICARD_ROOT}/ViewSam.jar -h |& grep Version | cut -d' ' -f2
	echo -n 'gatk     : '; java -jar ${GATK_ROOT}/GenomeAnalysisTK.jar --help | grep 'The Genome Analysis Toolkit' | cut -d',' -f1 | cut -d'v' -f2
	echo -n 'pindel   : '; pindel | grep 'Pindel version' | head -n1 | cut -d' ' -f3 | sed 's/,$//'
}

###########################################################################

function fastqc_check
{
	#[command input="${INPUT_FASTQ}"]
	#[command output.final="${OUTPUT}.fastqc.zip"]
	fastqc --noextract --nogroup -o $(dirname ${OUTPUT}.fastqc.zip) ${INPUT_FASTQ} && \
		mv -vf $(dirname ${OUTPUT}.fastqc.zip)/$(basename ${INPUT_FASTQ} \
			| sed 's/\(.fastq\|\)\(.gz\|.bz2\|\)$/_fastqc.zip/g') ${OUTPUT}.fastqc.zip
}

function bwa_build_index
{
	SP_set ALGORITHM="is"
	SP_set ALGORITHM="bwtsw" if_fasta_is_long_genome INPUT_FASTA="${REFERENCE}"

	#[command input="${REFERENCE}" output.final="${REFERENCE}.bwt"]
	bwa index -a ${ALGORITHM} ${REFERENCE}
}

function bwa_read_mapping
{
	SP_set BWA_ALN_OPTS="-I ${BWA_ALN_OPTS}" if_fastq_is_not_base_33 INPUT_FASTQ="${FASTQ_1}"
	SP_set BWA_SAMPE_OPTS=""
	SP_set BWA_END_IND="5"    # do not put an indel within INT bp towards the ends.
	SP_set BWA_GAP_EXT="-1"   # maximum number of gap extensions, -1 for disabling long gaps.
	SP_set RGID="1"
	SP_set SAMPLE_NAME="${RGID}"
	SP_set RGLB="${RGID}"
	SP_set RGPL="illumina"
	SP_set RGCN="BGI"

	# This will be skipped if the index files exist.
	SP_run bwa_build_index REFERENCE="${REFERENCE}"

	SP_parallel_begin
	
	#[command require="${REFERENCE}" input="${FASTQ_1}" output.temp="${FASTQ_1}.sai"]
	bwa aln -t ${THREAD_NUM} -i ${BWA_END_IND} -e ${BWA_GAP_EXT} \
		${REFERENCE} ${FASTQ_1} -f ${FASTQ_1}.sai ${BWA_ALN_OPTS}
	
	#[command require="${REFERENCE}" input="${FASTQ_2}" output.temp="${FASTQ_2}.sai"]
	bwa aln -t ${THREAD_NUM} -i ${BWA_END_IND} -e ${BWA_GAP_EXT} \
		${REFERENCE} ${FASTQ_2} -f ${FASTQ_2}.sai ${BWA_ALN_OPTS}
	
	SP_parallel_end

	#[command require="${REFERENCE}"]
	#[command input="${FASTQ_1}" input="${FASTQ_1}.sai"]
	#[command input="${FASTQ_2}" input="${FASTQ_2}.sai"]
	#[command output.final="${OUTPUT}.bam"]
	bwa sampe -P ${REFERENCE} -a ${MAX_INSERT_SIZE} \
		-r "@RG\tID:${RGID}\tSM:${SAMPLE_NAME}\tLB:${RGLB}\tPL:${RGPL}\tCN:${RGCN}" \
		${FASTQ_1}.sai ${FASTQ_2}.sai ${FASTQ_1} ${FASTQ_2} ${BWA_SAMPE_OPTS} \
		| samtools view -Sb - \
		> ${OUTPUT}.bam
}

function bwa_read_mapping_se
{
	SP_set BWA_ALN_OPTS="-I ${BWA_ALN_OPTS}" if_fastq_is_not_base_33 INPUT_FASTQ="${INPUT_FASTQ}"
	SP_set BWA_SAMSE_OPTS="${BWA_SAMSE_OPTS}"
	SP_set BWA_END_IND="5"    # do not put an indel within INT bp towards the ends.
	SP_set BWA_GAP_EXT="-1"   # maximum number of gap extensions, -1 for disabling long gaps.
	SP_set RGID="1"
	SP_set SAMPLE_NAME="${RGID}"
	SP_set RGLB="${RGID}"
	SP_set RGPL="illumina"
	SP_set RGCN="BGI"

	# This will be skipped if the index files exist.
	SP_run bwa_build_index REFERENCE="${REFERENCE}"

	#[command require="${REFERENCE}"]
	#[command input="${INPUT_FASTQ}" output.temp="${INPUT_FASTQ}.sai"]
	bwa aln -t ${THREAD_NUM} -i ${BWA_END_IND} -e ${BWA_GAP_EXT} \
		${REFERENCE} ${INPUT_FASTQ} -f ${INPUT_FASTQ}.sai ${BWA_ALN_OPTS}

	#[command require="${REFERENCE}"]
	#[command input="${INPUT_FASTQ}" input="${INPUT_FASTQ}.sai"]
	#[command output.final="${OUTPUT}.bam"]
	bwa samse \
		-r "@RG\tID:${RGID}\tSM:${SAMPLE_NAME}\tLB:${RGLB}\tPL:${RGPL}\tCN:${RGCN}" \
		${REFERENCE} ${INPUT_FASTQ}.sai ${INPUT_FASTQ} ${BWA_SAMSE_OPTS} \
		| samtools view -Sb - \
		> ${OUTPUT}.bam
}

#[procedure input="${INPUT_1}.bam"]
#[procedure input="${INPUT_2}.bam"]
#[procedure output="${OUTPUT}.bam"]
function merge_bam
{
	SP_set VALIDATION_STRINGENCY="STRICT"
	SP_set INPUT_3="${INPUT_3}"
	SP_set INPUT_4="${INPUT_4}"
	SP_set INPUT_5="${INPUT_5}"
	SP_set INPUT_6="${INPUT_6}"
	SP_set INPUT_7="${INPUT_7}"
	SP_set INPUT_8="${INPUT_8}"
	SP_set INPUT_9="${INPUT_9}"
	SP_set MORE_INPUT_FILES=""
	SP_set MORE_INPUT_FILES="${MORE_INPUT_FILES} INPUT=${INPUT_3}.bam" if_has_arg ARG="${INPUT_3}"
	SP_set MORE_INPUT_FILES="${MORE_INPUT_FILES} INPUT=${INPUT_4}.bam" if_has_arg ARG="${INPUT_4}"
	SP_set MORE_INPUT_FILES="${MORE_INPUT_FILES} INPUT=${INPUT_5}.bam" if_has_arg ARG="${INPUT_5}"
	SP_set MORE_INPUT_FILES="${MORE_INPUT_FILES} INPUT=${INPUT_6}.bam" if_has_arg ARG="${INPUT_6}"
	SP_set MORE_INPUT_FILES="${MORE_INPUT_FILES} INPUT=${INPUT_7}.bam" if_has_arg ARG="${INPUT_7}"
	SP_set MORE_INPUT_FILES="${MORE_INPUT_FILES} INPUT=${INPUT_8}.bam" if_has_arg ARG="${INPUT_8}"
	SP_set MORE_INPUT_FILES="${MORE_INPUT_FILES} INPUT=${INPUT_9}.bam" if_has_arg ARG="${INPUT_9}"

	#[command input="${INPUT_1}.bam"]
	#[command input="${INPUT_2}.bam"]
	#[command output="${OUTPUT}.bam"]
	#[command output="${OUTPUT}.bai"]
	java ${JAVA_OPTS} -jar ${PICARD_ROOT}/MergeSamFiles.jar ${PICARD_OPTS} \
		VALIDATION_STRINGENCY=${VALIDATION_STRINGENCY} \
		COMPRESSION_LEVEL=9 \
		CREATE_INDEX=true \
		CREATE_MD5_FILE=false \
		INPUT=${INPUT_1}.bam \
		INPUT=${INPUT_2}.bam \
		${MORE_INPUT_FILES} \
		OUTPUT=${OUTPUT}.bam

	#[command require="${OUTPUT}.bai"]
	mv -vf ${OUTPUT}.bai ${OUTPUT}.bam.bai
}

#[procedure input="${INPUT}.bam"]
#[procedure output="${INPUT}.sorted.bam"]
function sort_bam
{
	SP_set VALIDATION_STRINGENCY="STRICT"

	#[command input="${INPUT}.bam"]
	#[command output="${INPUT}.sorted.bam"]
	#[command output="${INPUT}.sorted.bai"]
	java ${JAVA_OPTS} -jar ${PICARD_ROOT}/SortSam.jar ${PICARD_OPTS} \
		VALIDATION_STRINGENCY=${VALIDATION_STRINGENCY} \
		COMPRESSION_LEVEL=9 \
		CREATE_INDEX=true \
		CREATE_MD5_FILE=false \
		SORT_ORDER=coordinate \
		INPUT=${INPUT}.bam \
		OUTPUT=${INPUT}.sorted.bam

	#[command require="${INPUT}.sorted.bai"]
	mv -vf ${INPUT}.sorted.bai ${INPUT}.sorted.bam.bai
}

#[procedure require="${REFERENCE}"]
#[procedure input="${INPUT}.bam"]
#[procedure output="${INPUT}.reordered.bam"]
function reorder_bam
{
	SP_set VALIDATION_STRINGENCY="STRICT"

	#[command input="${INPUT}.bam"]
	#[command output="${INPUT}.reordered.bam"]
	java ${JAVA_OPTS} -jar ${PICARD_ROOT}/ReorderSam.jar ${PICARD_OPTS} \
		VALIDATION_STRINGENCY=${VALIDATION_STRINGENCY} \
		COMPRESSION_LEVEL=9 \
		CREATE_INDEX=true \
		CREATE_MD5_FILE=false \
		INPUT=${INPUT}.bam \
		OUTPUT=${INPUT}.reordered.bam \
		REFERENCE=${REFERENCE}

	# Rename index file if sorted
	[ -f ${INPUT}.reordered.bai ] && mv -vf ${INPUT}.reordered.bai ${INPUT}.reordered.bam.bai
}

#[procedure input="${INPUT}.bam"]
#[procedure output="${INPUT}.flagstat.txt"]
function flagstat_bam
{
	#[command input="${INPUT}.bam" output="${INPUT}.flagstat.txt"]
	samtools flagstat ${INPUT}.bam > ${INPUT}.flagstat.txt
}

#[procedure input="${INPUT}.bam"]
#[procedure output="${INPUT}.uniq.bam"]
#[procedure output="${INPUT}.non_uniq.bam"]
function select_unique_hit_reads
{
	samtools view -h ${INPUT}.bam | tee \
		>(egrep '^@|XT:A:U' | samtools view -Sb - > ${INPUT}.uniq.bam) \
		| egrep -v 'XT:A:U' | samtools view -Sb - > ${INPUT}.non_uniq.bam
}

#[procedure input="${INPUT}.bam"]
#[procedure output="${INPUT}.mkdup.bam"]
function mark_duplicate_reads
{
	SP_set ASSUME_SORTED="false"
	SP_set VALIDATION_STRINGENCY="STRICT"

	#[command input="${INPUT}.bam"]
	#[command output="${INPUT}.mkdup.bam"]
	#[command output="${INPUT}.mkdup.metrics"]
	java ${JAVA_OPTS} -jar ${PICARD_ROOT}/MarkDuplicates.jar ${PICARD_OPTS} \
		VALIDATION_STRINGENCY=${VALIDATION_STRINGENCY} \
		COMPRESSION_LEVEL=9 \
		CREATE_INDEX=true \
		CREATE_MD5_FILE=false \
		INPUT=${INPUT}.bam \
		OUTPUT=${INPUT}.mkdup.bam \
		METRICS_FILE=${INPUT}.mkdup.metrics \
		REMOVE_DUPLICATES=false

	#[command require="${INPUT}.mkdup.bai"]
	mv -vf ${INPUT}.mkdup.bai ${INPUT}.mkdup.bam.bai
}

#[procedure input="${INPUT}.bam"]
#[procedure output="${INPUT}.rmdup.bam"]
function remove_duplicate_reads
{
	SP_set ASSUME_SORTED="false"
	SP_set VALIDATION_STRINGENCY="STRICT"

	#[command input="${INPUT}.bam"]
	#[command output="${INPUT}.rmdup.bam"]
	#[command output="${INPUT}.rmdup.metrics"]
	java ${JAVA_OPTS} -jar ${PICARD_ROOT}/MarkDuplicates.jar ${PICARD_OPTS} \
		VALIDATION_STRINGENCY=${VALIDATION_STRINGENCY} \
		COMPRESSION_LEVEL=9 \
		CREATE_INDEX=true \
		CREATE_MD5_FILE=false \
		INPUT=${INPUT}.bam \
		OUTPUT=${INPUT}.rmdup.bam \
		METRICS_FILE=${INPUT}.rmdup.metrics \
		REMOVE_DUPLICATES=true

	#[command require="${INPUT}.rmdup.bai"]
	mv -vf ${INPUT}.rmdup.bai ${INPUT}.rmdup.bam.bai
}

#[procedure input="${INPUT}.bam"]
#[procedure input="${INPUT}.bam.bai"]
#[procedure output="${INPUT}.mpileup.bz2"]
function samtools_mpileup
{
	SP_set REFERENCE="${REFERENCE}"
	SP_set SAMTOOLS_OPTS="${SAMTOOLS_OPTS} -f ${REFERENCE}" if_has_arg ARG="${REFERENCE}"

	samtools mpileup ${SAMTOOLS_OPTS} ${INPUT}.bam | bzip2 -9c > ${INPUT}.mpileup.bz2
}

#[procedure require="${REFERENCE}"]
#[procedure input="${INPUT}.bam"]
#[procedure input="${INPUT}.bam.bai"]
#[procedure output="${OUTPUT}.samtools.vcf"]
function samtools_call_variants
{
	samtools mpileup -f ${REFERENCE} ${INPUT}.bam -u \
		| bcftools view -vcg - \
		> ${OUTPUT}.samtools.vcf
}

#[procedure input="${INPUT}.bam"]
#[procedure output="${INPUT}.rg.bam"]
function picard_set_read_group
{
	# READ_GROUP_ID   : Read Group ID  Default value: 1. This option can be set to 'null' to clear the default 
	# LIBRARY         : Read Group Library  Required. 
	# PLATFORM        : Read Group platform (e.g. illumina, solid)  Required. 
	# PLATFORM_UNIT   : Read Group platform unit (eg. run barcode)  Required. 
	# SAMPLE_NAME     : Read Group sample name  Required. 
	# SEQ_CENTER_NAME : Read Group sequencing center name  Default value: null. 
	# DESCRIPTION     : Read Group description  Default value: null. 

	SP_set READ_GROUP_ID="1"
	SP_set PLATFORM="illumina"
	SP_set PLATFORM_UNIT="flowcell.lane"
	SP_set SEQ_CENTER_NAME="BGI"
	SP_set DESCRIPTION="${DESCRIPTION}"
	SP_set PICARD_RG_OPTS="${PICARD_RG_OPTS} RGDS=\"${DESCRIPTION}\"" if_has_arg ARG="${DESCRIPTION}"
	SP_set VALIDATION_STRINGENCY="STRICT"

	#[command input="${INPUT}.bam"]
	#[command output="${INPUT}.rg.bam"]
	java ${JAVA_OPTS} -jar ${PICARD_ROOT}/AddOrReplaceReadGroups.jar ${PICARD_OPTS} \
		VALIDATION_STRINGENCY=${VALIDATION_STRINGENCY} \
		COMPRESSION_LEVEL=9 \
		CREATE_INDEX=true \
		CREATE_MD5_FILE=false \
		INPUT=${INPUT}.bam \
		OUTPUT=${INPUT}.rg.bam \
		RGID=${READ_GROUP_ID} \
		RGLB=${LIBRARY} \
		RGPL=${PLATFORM} \
		RGPU=${PLATFORM_UNIT} \
		RGSM=${SAMPLE_NAME} \
		RGCN=${SEQ_CENTER_NAME} \
		${PICARD_RG_OPTS}

	# Rename index file if sorted
	if [ -f ${INPUT}.rg.bai ]; then \
		mv -vf ${INPUT}.rg.bai ${INPUT}.rg.bam.bai; \
	fi
}

#[procedure require="${REFERENCE}"]
#[procedure require="${GATK_VCF_DBSNP}"]
#[procedure input="${INPUT}.bam"]
#[procedure output="${INPUT}.realign.fixmate.bam"]
function gatk_indel_realign
{
	SP_set VALIDATION_STRINGENCY="STRICT"

	#[command require="${REFERENCE}"]
	#[command input="${INPUT}.bam"]
	#[command output="${INPUT}.intervals"]
	java ${JAVA_OPTS} -jar ${GATK_ROOT}/GenomeAnalysisTK.jar ${GATK_OPTS} \
		-T RealignerTargetCreator \
		-R ${REFERENCE} \
		-I ${INPUT}.bam \
		-o ${INPUT}.intervals \
		-known ${GATK_VCF_DBSNP} \
		--num_threads ${THREAD_NUM}

	#[command require="${REFERENCE}"]
	#[command input="${INPUT}.bam"]
	#[command input="${INPUT}.intervals"]
	#[command output="${INPUT}.realign.bam"]
	java ${JAVA_OPTS} -jar ${GATK_ROOT}/GenomeAnalysisTK.jar ${GATK_OPTS} \
		-T IndelRealigner \
		-R ${REFERENCE} \
		-I ${INPUT}.bam \
		-targetIntervals ${INPUT}.intervals \
		-o ${INPUT}.realign.bam
	
	#[command input="${INPUT}.realign.bam"]
	#[command output="${INPUT}.realign.fixmate.bam"]
	#[command output="${INPUT}.realign.fixmate.bai"]
	java ${JAVA_OPTS} -jar ${PICARD_ROOT}/FixMateInformation.jar ${PICARD_OPTS} \
		VALIDATION_STRINGENCY=${VALIDATION_STRINGENCY} \
		COMPRESSION_LEVEL=9 \
		CREATE_INDEX=true \
		CREATE_MD5_FILE=false \
		INPUT=${INPUT}.realign.bam \
		OUTPUT=${INPUT}.realign.fixmate.bam

	#[command require="${INPUT}.realign.fixmate.bai"]
	mv -vf ${INPUT}.realign.fixmate.bai ${INPUT}.realign.fixmate.bam.bai

	#[command require="${INPUT}.realign.bam"]
	rm -vf ${INPUT}.intervals

	#[command require="${INPUT}.realign.fixmate.bam"]
	rm -vf ${INPUT}.realign.ba{m,i}
}

#[procedure require="${REFERENCE}"]
#[procedure require="${GATK_VCF_DBSNP}"]
#[procedure input="${INPUT}.bam"]
#[procedure output="${INPUT}.recal.bam"]
function gatk_recalibrate
{
	#[command require="${REFERENCE}"]
	#[command require="${GATK_VCF_DBSNP}"]
	#[command input="${INPUT}.bam"]
	#[command output="${INPUT}.recal"]
	java ${JAVA_OPTS} \
		-jar ${GATK_ROOT}/GenomeAnalysisTK.jar ${GATK_OPTS} \
		-T CountCovariates \
		-R ${REFERENCE} \
		-I ${INPUT}.bam \
		-knownSites ${GATK_VCF_DBSNP} \
		-cov ReadGroupCovariate \
		-cov QualityScoreCovariate \
		-cov CycleCovariate \
		-cov DinucCovariate \
		-recalFile ${INPUT}.recal \
		--num_threads ${THREAD_NUM}
	
	#[command require="${REFERENCE}"]
	#[command input="${INPUT}.bam"]
	#[command input="${INPUT}.recal"]
	#[command output="${INPUT}.recal.bam"]
	java ${JAVA_OPTS} \
		-jar ${GATK_ROOT}/GenomeAnalysisTK.jar ${GATK_OPTS} \
		-T TableRecalibration \
		-R ${REFERENCE} \
		-I ${INPUT}.bam \
		-recalFile ${INPUT}.recal \
		-o ${INPUT}.recal.bam

	#[command require="${INPUT}.recal.bai"]
	mv -vf ${INPUT}.recal.bai ${INPUT}.recal.bam.bai

	#[command require="${INPUT}.recal.bam"]
	rm -vf ${INPUT}.recal
}

#[procedure require="${REFERENCE}"]
#[procedure require="${GATK_VCF_DBSNP}"]
#[procedure input="${INPUT}.bam"]
#[procedure output="${OUTPUT}.vcf"]
function gatk_genotype
{
	SP_set STAND_CALL_CONF=30
	SP_set STAND_EMIT_CONF=10
	SP_set MIN_BASE_QUALITY_SCORE=20

	#[command require="${REFERENCE}"]
	#[command require="${GATK_VCF_DBSNP}"]
	#[command input="${INPUT}.bam"]
	#[command output="${OUTPUT}.vcf"]
	java ${JAVA_OPTS} \
		-jar ${GATK_ROOT}/GenomeAnalysisTK.jar ${GATK_OPTS} \
		-T UnifiedGenotyper \
		-R ${REFERENCE} \
		-I ${INPUT}.bam \
		-glm BOTH \
		--min_base_quality_score ${MIN_BASE_QUALITY_SCORE} \
		--dbsnp ${GATK_VCF_DBSNP} \
		-stand_call_conf ${STAND_CALL_CONF} \
		-stand_emit_conf ${STAND_EMIT_CONF} \
		-o ${OUTPUT}.vcf \
		--num_threads ${THREAD_NUM}
}

#[procedure require="${REFERENCE}"]
#[procedure input="${INPUT}.vcf"]
#[procedure output="${INPUT}.filtered.vcf"]
function gatk_filter_variants
{
	SP_set FILTER_NAME="Filter"
	SP_set FILTER_EXPRESSION="(AB ?: 0) > 0.75 || QUAL < 50.0 || DP < 10 || DP > 360 || (SB ?: -1) > -0.1 || MQ0 >= 4"

##### Select Different Types of Variants #####
	SP_parallel_begin

	#[command require="${REFERENCE}"]
	#[command input="${INPUT}.vcf"]
	#[command output="${INPUT}.snp.vcf"]
	java ${JAVA_OPTS} \
		-jar ${GATK_ROOT}/GenomeAnalysisTK.jar ${GATK_OPTS} \
		-T SelectVariants \
		-R ${REFERENCE} \
		--variant ${INPUT}.vcf \
		-selectType SNP -selectType MNP \
		-o ${INPUT}.snp.vcf

	#[command require="${REFERENCE}"]
	#[command input="${INPUT}.vcf"]
	#[command output="${INPUT}.indel.vcf"]
	java ${JAVA_OPTS} \
		-jar ${GATK_ROOT}/GenomeAnalysisTK.jar ${GATK_OPTS} \
		-T SelectVariants \
		-R ${REFERENCE} \
		--variant ${INPUT}.vcf \
		-selectType INDEL \
		-o ${INPUT}.indel.vcf

	SP_parallel_end

##### Filter Variants #####
	SP_parallel_begin

	#[command require="${REFERENCE}"]
	#[command input="${INPUT}.snp.vcf"]
	#[command output="${INPUT}.snp.filtered.vcf"]
	java ${JAVA_OPTS} \
		-jar ${GATK_ROOT}/GenomeAnalysisTK.jar ${GATK_OPTS} \
		-T VariantFiltration \
		-R ${REFERENCE} \
		--variant ${INPUT}.snp.vcf \
		--filterExpression "${FILTER_EXPRESSION}" \
		--filterName "${FILTER_NAME}" \
		--clusterWindowSize 10 \
		-o ${INPUT}.snp.filtered.vcf

	#[command require="${REFERENCE}"]
	#[command input="${INPUT}.indel.vcf"]
	#[command output="${INPUT}.indel.filtered.vcf"]
	java ${JAVA_OPTS} \
		-jar ${GATK_ROOT}/GenomeAnalysisTK.jar ${GATK_OPTS} \
		-T VariantFiltration \
		-R ${REFERENCE} \
		--variant ${INPUT}.snp.vcf \
		--filterExpression "${FILTER_EXPRESSION}" \
		--filterName "${FILTER_NAME}" \
		--clusterWindowSize 10 \
		-o ${INPUT}.indel.filtered.vcf

	SP_parallel_end

##### Combine Variants #####
	#[command input="${INPUT}.snp.filtered.vcf"]
	#[command input="${INPUT}.indel.filtered.vcf"]
	#[command output="${INPUT}.filtered.vcf"]
	java ${JAVA_OPTS} \
		-jar ${GATK_ROOT}/GenomeAnalysisTK.jar ${GATK_OPTS} \
		-T CombineVariants \
		-R ${REFERENCE} \
		--variant:SNP ${INPUT}.snp.filtered.vcf \
		--variant:INDEL ${INPUT}.indel.filtered.vcf \
		-o ${INPUT}.filtered.vcf \
		-genotypeMergeOptions UNIQUIFY
}

#[procedure require="${REFERENCE}"]
#[procedure require="${GATK_VCF_DBSNP}"]
#[procedure input="${INPUT}.bam"]
#[procedure output="${OUTPUT}.gatk.vcf"]
function gatk_call_variants
{
	SP_set VALIDATION_STRINGENCY="STRICT"

	SP_run gatk_indel_realign \
		REFERENCE=${REFERENCE} GATK_VCF_DBSNP=${GATK_VCF_DBSNP} \
		INPUT=${INPUT} \
		VALIDATION_STRINGENCY=${VALIDATION_STRINGENCY}

	SP_run gatk_recalibrate \
		REFERENCE=${REFERENCE} GATK_VCF_DBSNP=${GATK_VCF_DBSNP} \
		INPUT=${INPUT}.realign.fixmate

	SP_run gatk_genotype \
		REFERENCE=${REFERENCE} GATK_VCF_DBSNP=${GATK_VCF_DBSNP} \
		INPUT=${INPUT}.realign.fixmate.recal \
		OUTPUT=${OUTPUT}

	SP_run gatk_filter_variants \
		REFERENCE=${REFERENCE} \
		INPUT=${OUTPUT}

	#[command require="${OUTPUT}.filtered.vcf"]
	mv -vf ${OUTPUT}.filtered.vcf ${OUTPUT}.gatk.vcf
}

#[procedure require="${REFERENCE}"]
#[procedure input="${INPUT}.bam"]
#[procedure require="${INPUT}.bam.bai"]
#[procedure output="${OUTPUT}.pindel.vcf"]
function pindel_call_structure_variants
{
	SP_eval REFERENCE_NAME get_file_name FILE=${REFERENCE}
	SP_eval REFERENCE_DATE get_file_date FILE=${REFERENCE}

##### Call Structure Variants #####
	#[command require="${REFERENCE}"]
	#[command input="${INPUT}.bam"]
	#[command output="${OUTPUT}_D"]
	#[command output="${OUTPUT}_SI"]
	#[command output="${OUTPUT}_INV"]
	#[command output="${OUTPUT}_TD"]
	pindel -f ${REFERENCE} \
		-i <(echo "${INPUT}.bam ${INSERT_SIZE} ${SAMPLE_NAME}") \
		-l -k -s -c ALL -o ${OUTPUT}

##### Convert to VCF #####
	SP_parallel_begin

	#[command require="${REFERENCE}"]
	#[command input="${OUTPUT}_D"]
	#[command output="${OUTPUT}_D.vcf"]
	pindel2vcf -r ${REFERENCE} -R ${REFERENCE_NAME} -d ${REFERENCE_DATE} \
		-p ${OUTPUT}_D -v ${OUTPUT}_D.vcf

	#[command require="${REFERENCE}"]
	#[command input="${OUTPUT}_SI"]
	#[command output="${OUTPUT}_SI.vcf"]
	pindel2vcf -r ${REFERENCE} -R ${REFERENCE_NAME} -d ${REFERENCE_DATE} \
		-p ${OUTPUT}_SI -v ${OUTPUT}_SI.vcf
	
	#[command require="${REFERENCE}"]
	#[command input="${OUTPUT}_INV"]
	#[command output="${OUTPUT}_INV.vcf"]
	pindel2vcf -r ${REFERENCE} -R ${REFERENCE_NAME} -d ${REFERENCE_DATE} \
		-p ${OUTPUT}_INV -v ${OUTPUT}_INV.vcf
	
	#[command require="${REFERENCE}"]
	#[command input="${OUTPUT}_TD"]
	#[command output="${OUTPUT}_TD.vcf"]
	pindel2vcf -r ${REFERENCE} -R ${REFERENCE_NAME} -d ${REFERENCE_DATE} \
		-p ${OUTPUT}_TD -v ${OUTPUT}_TD.vcf

	SP_parallel_end

##### Combine Variants #####
	#[command require="${REFERENCE}"]
	#[command input="${OUTPUT}_D.vcf"]
	#[command input="${OUTPUT}_SI.vcf"]
	#[command input="${OUTPUT}_INV.vcf"]
	#[command input="${OUTPUT}_TD.vcf"]
	#[command output="${INPUT}.pindel.vcf"]
	java ${JAVA_OPTS} \
		-jar ${GATK_ROOT}/GenomeAnalysisTK.jar ${GATK_OPTS} \
		-T CombineVariants \
		-R ${REFERENCE} \
		--variant:D ${OUTPUT}_D.vcf \
		--variant:SI ${OUTPUT}_SI.vcf \
		--variant:INV ${OUTPUT}_INV.vcf \
		--variant:TD ${OUTPUT}_TD.vcf \
		-o ${OUTPUT}.pindel.vcf \
		-genotypeMergeOptions UNIQUIFY
}

#[procedure type="pipeline"]
#[procedure require="${REFERENCE}"]
#[procedure input="${FASTQ_1}.fq.gz"]
#[procedure input="${FASTQ_2}.fq.gz"]
#[procedure output="${SAMPLE_NAME}.samtools.vcf"]
#[procedure output="${SAMPLE_NAME}.gatk.vcf"]
#[procedure output="${SAMPLE_NAME}.pindel.vcf"]
#[procedure output="${SAMPLE_NAME}.summary.txt"]
function DNAseq_analysis
{
	SP_set RGID="${RGID}"
	SP_set RGLB="${RGLB}"
	SP_set RGPL="${RGPL}"
	SP_set RGCN="${RGCN}"

##### FastQC #####
	SP_parallel_begin
	SP_run fastqc_check INPUT_FASTQ=${FASTQ_1}.fq.gz OUTPUT=${FASTQ_1}
	SP_run fastqc_check INPUT_FASTQ=${FASTQ_2}.fq.gz OUTPUT=${FASTQ_2}
	SP_parallel_end

##### Read Mapping #####
	SP_run bwa_read_mapping \
		REFERENCE=${REFERENCE} \
		FASTQ_1=${FASTQ_1}.fq.gz \
		FASTQ_2=${FASTQ_2}.fq.gz \
		MAX_INSERT_SIZE=${MAX_INSERT_SIZE} \
		OUTPUT=${SAMPLE_NAME} \
		RGID=${RGID} \
		RGLB=${RGLB} \
		SAMPLE_NAME=${SAMPLE_NAME} \
		RGPL=${RGPL} \
		RGCN=${RGCN}

##### Call Variants #####
	SP_parallel_begin
	#[command input="${SAMPLE_NAME}.bam" output="${SAMPLE_NAME}.mapped.bam"]
	samtools view ${SAMPLE_NAME}.bam -F 4 -b > ${SAMPLE_NAME}.mapped.bam
	#[command input="${SAMPLE_NAME}.bam" output="${SAMPLE_NAME}.unmapped.bam"]
	samtools view ${SAMPLE_NAME}.bam -f 4 -b > ${SAMPLE_NAME}.unmapped.bam
	SP_parallel_end

	SP_run select_unique_hit_reads INPUT=${SAMPLE_NAME}.mapped

	SP_run sort_bam INPUT=${SAMPLE_NAME}.mapped.uniq

	SP_run mark_duplicate_reads INPUT=${SAMPLE_NAME}.mapped.uniq.sorted

	SP_run samtools_call_variants \
		REFERENCE=${REFERENCE} \
		INPUT=${SAMPLE_NAME}.mapped.uniq.sorted.mkdup \
		OUTPUT=${SAMPLE_NAME}

	SP_run gatk_call_variants \
		REFERENCE=${REFERENCE} \
		INPUT=${SAMPLE_NAME}.mapped.uniq.sorted.mkdup \
		OUTPUT=${SAMPLE_NAME}

##### Call Structure Variants #####
	SP_run sort_bam \
		VALIDATION_STRINGENCY=SILENT \
		INPUT=${SAMPLE_NAME} \
		OUTPUT=${SAMPLE_NAME}.sorted

	SP_run pindel_call_structure_variants \
		REFERENCE=${REFERENCE} \
		SAMPLE_NAME=${SAMPLE_NAME} \
		INSERT_SIZE=${INSERT_SIZE} \
		INPUT=${SAMPLE_NAME}.sorted \
		OUTPUT=${SAMPLE_NAME}

##### Report Summary Counts #####
	( \
		echo -n 'Total reads  : '; unzip -p ${FASTQ_1}.fastqc.zip '*/fastqc_data.txt' | grep 'Total Sequences' | cut -f2 | awk '{print $1+$1}'; \
		echo -n 'Mapped reads : '; samtools view -c ${SAMPLE_NAME}.mapped.bam; \
		echo -n 'Unique reads : '; samtools view -c ${SAMPLE_NAME}.mapped.uniq.bam; \
		echo -n 'Non-Duplicate: '; samtools flagstat ${SAMPLE_NAME}.mapped.uniq.sorted.mkdup.bam | head -n2 | awk '{print $1}' | paste - - | awk '{print $1-$2}'; \
		echo -n 'Variants (gatk)    :'; cat ${SAMPLE_NAME}.gatk.vcf | grep -v ^# | wc -l; \
		echo -n 'Variants (samtools):'; cat ${SAMPLE_NAME}.samtools.vcf | grep -v ^# | wc -l; \
		echo -n 'Variants (pindel)  :'; cat ${SAMPLE_NAME}.pindel.vcf | grep -v ^# | wc -l; \
	) > ${SAMPLE_NAME}.summary.txt

##### Remove Intermediate Files #####
	#[command require="${SAMPLE_NAME}.mapped.uniq.bam"]
	#[command require="${SAMPLE_NAME}.mapped.non_uniq.bam"]
	rm -vf ${SAMPLE_NAME}.mapped.bam
	#[command require="${SAMPLE_NAME}.mapped.uniq.sorted.bam"]
	rm -vf ${SAMPLE_NAME}.mapped.uniq.bam
	#[command require="${SAMPLE_NAME}.mapped.uniq.sorted.mkdup.bam"]
	rm -vf ${SAMPLE_NAME}.mapped.uniq.sorted.bam{,.bai}
}
